Author

Bartosz Czyż

Published

September 29, 2025

1 Load packages

Show the code
library(tidymodels)
library(tidyverse)
library(stringr)
library(vip)
library(pdp)
library(DALEX)
library(DALEXtra)
library(bestNormalize)
library(rules)
library(baguette)
library(finetune)
library(doParallel)
library(skimr)
library(janitor)
library(corrplot)
library(naniar)
library(glmnet)
library(glmnet)
library(kernlab)
library(kknn)
library(ranger)
library(xgboost)
library(gt)

tidymodels_prefer()

2 Read

Show the code
# Import train.csv, standardise the column names with janitor, and store the
# id column as character.
house_data <- read_csv("train.csv") %>%
  clean_names() %>%
  mutate(across(id, as.character))

# Tabulate the number and percentage of missing values for every column.
house_data %>% miss_var_summary()
# A tibble: 81 × 3
   variable      n_miss pct_miss
   <chr>          <int>    <num>
 1 pool_qc         1453    99.5 
 2 misc_feature    1406    96.3 
 3 alley           1369    93.8 
 4 fence           1179    80.8 
 5 fireplace_qu     690    47.3 
 6 lot_frontage     259    17.7 
 7 garage_type       81     5.55
 8 garage_yr_blt     81     5.55
 9 garage_finish     81     5.55
10 garage_qual       81     5.55
# ℹ 71 more rows
Show the code
# Drop a hand-picked set of columns, then convert every character column and
# ms_sub_class to a factor in which missing values become their own level.
# The first five dropped columns top the missing-value table.
# NOTE(review): the other seven are dropped for a reason not shown here
# (presumably near-constant values) -- confirm.
house_data <- house_data |>
  select(!c(
    pool_qc, misc_feature, alley, fence, fireplace_qu,
    utilities, street, land_slope, roof_matl, heating,
    electrical, functional
  )) |>
  mutate(
    across(where(is.character), as.factor),
    ms_sub_class = as.factor(ms_sub_class)
  ) |>
  # Turn NA entries of factors into an explicit "Missing" level.
  mutate(across(
    where(is.factor),
    \(col) fct_explicit_na(col, na_level = "Missing")
  )) |>
  # Discard factors that hold only a single distinct value.
  select(!where(\(col) is.factor(col) && n_distinct(col) == 1))

summary(house_data)
       id        ms_sub_class   ms_zoning     lot_frontage       lot_area     
 1      :   1   20     :536   C (all):  10   Min.   : 21.00   Min.   :  1300  
 10     :   1   60     :299   FV     :  65   1st Qu.: 59.00   1st Qu.:  7554  
 100    :   1   50     :144   RH     :  16   Median : 69.00   Median :  9478  
 1000   :   1   120    : 87   RL     :1151   Mean   : 70.05   Mean   : 10517  
 1001   :   1   30     : 69   RM     : 218   3rd Qu.: 80.00   3rd Qu.: 11602  
 1002   :   1   160    : 63                  Max.   :313.00   Max.   :215245  
 (Other):1454   (Other):262                  NA's   :259                      
 lot_shape land_contour   lot_config    neighborhood   condition1  
 IR1:484   Bnk:  63     Corner : 263   NAmes  :225   Norm   :1260  
 IR2: 41   HLS:  50     CulDSac:  94   CollgCr:150   Feedr  :  81  
 IR3: 10   Low:  36     FR2    :  47   OldTown:113   Artery :  48  
 Reg:925   Lvl:1311     FR3    :   4   Edwards:100   RRAn   :  26  
                        Inside :1052   Somerst: 86   PosN   :  19  
                                       Gilbert: 79   RRAe   :  11  
                                       (Other):707   (Other):  15  
   condition2    bldg_type     house_style   overall_qual     overall_cond  
 Norm   :1445   1Fam  :1220   1Story :726   Min.   : 1.000   Min.   :1.000  
 Feedr  :   6   2fmCon:  31   2Story :445   1st Qu.: 5.000   1st Qu.:5.000  
 Artery :   2   Duplex:  52   1.5Fin :154   Median : 6.000   Median :5.000  
 PosN   :   2   Twnhs :  43   SLvl   : 65   Mean   : 6.099   Mean   :5.575  
 RRNn   :   2   TwnhsE: 114   SFoyer : 37   3rd Qu.: 7.000   3rd Qu.:6.000  
 PosA   :   1                 1.5Unf : 14   Max.   :10.000   Max.   :9.000  
 (Other):   2                 (Other): 19                                   
   year_built   year_remod_add   roof_style    exterior1st   exterior2nd 
 Min.   :1872   Min.   :1950   Flat   :  13   VinylSd:515   VinylSd:504  
 1st Qu.:1954   1st Qu.:1967   Gable  :1141   HdBoard:222   MetalSd:214  
 Median :1973   Median :1994   Gambrel:  11   MetalSd:220   HdBoard:207  
 Mean   :1971   Mean   :1985   Hip    : 286   Wd Sdng:206   Wd Sdng:197  
 3rd Qu.:2000   3rd Qu.:2004   Mansard:   7   Plywood:108   Plywood:142  
 Max.   :2010   Max.   :2010   Shed   :   2   CemntBd: 61   CmentBd: 60  
                                              (Other):128   (Other):136  
  mas_vnr_type  mas_vnr_area    exter_qual exter_cond  foundation 
 BrkCmn : 15   Min.   :   0.0   Ex: 52     Ex:   3    BrkTil:146  
 BrkFace:445   1st Qu.:   0.0   Fa: 14     Fa:  28    CBlock:634  
 None   :864   Median :   0.0   Gd:488     Gd: 146    PConc :647  
 Stone  :128   Mean   : 103.7   TA:906     Po:   1    Slab  : 24  
 Missing:  8   3rd Qu.: 166.0              TA:1282    Stone :  6  
               Max.   :1600.0                         Wood  :  3  
               NA's   :8                                          
   bsmt_qual     bsmt_cond    bsmt_exposure bsmt_fin_type1  bsmt_fin_sf1   
 Ex     :121   Fa     :  45   Av     :221   ALQ    :220    Min.   :   0.0  
 Fa     : 35   Gd     :  65   Gd     :134   BLQ    :148    1st Qu.:   0.0  
 Gd     :618   Po     :   2   Mn     :114   GLQ    :418    Median : 383.5  
 TA     :649   TA     :1311   No     :953   LwQ    : 74    Mean   : 443.6  
 Missing: 37   Missing:  37   Missing: 38   Rec    :133    3rd Qu.: 712.2  
                                            Unf    :430    Max.   :5644.0  
                                            Missing: 37                    
 bsmt_fin_type2  bsmt_fin_sf2      bsmt_unf_sf     total_bsmt_sf    heating_qc
 ALQ    :  19   Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Ex:741    
 BLQ    :  33   1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   Fa: 49    
 GLQ    :  14   Median :   0.00   Median : 477.5   Median : 991.5   Gd:241    
 LwQ    :  46   Mean   :  46.55   Mean   : 567.2   Mean   :1057.4   Po:  1    
 Rec    :  54   3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2   TA:428    
 Unf    :1256   Max.   :1474.00   Max.   :2336.0   Max.   :6110.0             
 Missing:  38                                                                 
 central_air  x1st_flr_sf    x2nd_flr_sf   low_qual_fin_sf    gr_liv_area  
 N:  95      Min.   : 334   Min.   :   0   Min.   :  0.000   Min.   : 334  
 Y:1365      1st Qu.: 882   1st Qu.:   0   1st Qu.:  0.000   1st Qu.:1130  
             Median :1087   Median :   0   Median :  0.000   Median :1464  
             Mean   :1163   Mean   : 347   Mean   :  5.845   Mean   :1515  
             3rd Qu.:1391   3rd Qu.: 728   3rd Qu.:  0.000   3rd Qu.:1777  
             Max.   :4692   Max.   :2065   Max.   :572.000   Max.   :5642  
                                                                           
 bsmt_full_bath   bsmt_half_bath      full_bath       half_bath     
 Min.   :0.0000   Min.   :0.00000   Min.   :0.000   Min.   :0.0000  
 1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000  
 Median :0.0000   Median :0.00000   Median :2.000   Median :0.0000  
 Mean   :0.4253   Mean   :0.05753   Mean   :1.565   Mean   :0.3829  
 3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000  
 Max.   :3.0000   Max.   :2.00000   Max.   :3.000   Max.   :2.0000  
                                                                    
 bedroom_abv_gr  kitchen_abv_gr  kitchen_qual tot_rms_abv_grd    fireplaces   
 Min.   :0.000   Min.   :0.000   Ex:100       Min.   : 2.000   Min.   :0.000  
 1st Qu.:2.000   1st Qu.:1.000   Fa: 39       1st Qu.: 5.000   1st Qu.:0.000  
 Median :3.000   Median :1.000   Gd:586       Median : 6.000   Median :1.000  
 Mean   :2.866   Mean   :1.047   TA:735       Mean   : 6.518   Mean   :0.613  
 3rd Qu.:3.000   3rd Qu.:1.000                3rd Qu.: 7.000   3rd Qu.:1.000  
 Max.   :8.000   Max.   :3.000                Max.   :14.000   Max.   :3.000  
                                                                              
  garage_type  garage_yr_blt  garage_finish  garage_cars     garage_area    
 2Types :  6   Min.   :1900   Fin    :352   Min.   :0.000   Min.   :   0.0  
 Attchd :870   1st Qu.:1961   RFn    :422   1st Qu.:1.000   1st Qu.: 334.5  
 Basment: 19   Median :1980   Unf    :605   Median :2.000   Median : 480.0  
 BuiltIn: 88   Mean   :1979   Missing: 81   Mean   :1.767   Mean   : 473.0  
 CarPort:  9   3rd Qu.:2002                 3rd Qu.:2.000   3rd Qu.: 576.0  
 Detchd :387   Max.   :2010                 Max.   :4.000   Max.   :1418.0  
 Missing: 81   NA's   :81                                                   
  garage_qual    garage_cond   paved_drive  wood_deck_sf    open_porch_sf   
 Ex     :   3   Ex     :   2   N:  90      Min.   :  0.00   Min.   :  0.00  
 Fa     :  48   Fa     :  35   P:  30      1st Qu.:  0.00   1st Qu.:  0.00  
 Gd     :  14   Gd     :   9   Y:1340      Median :  0.00   Median : 25.00  
 Po     :   3   Po     :   7               Mean   : 94.24   Mean   : 46.66  
 TA     :1311   TA     :1326               3rd Qu.:168.00   3rd Qu.: 68.00  
 Missing:  81   Missing:  81               Max.   :857.00   Max.   :547.00  
                                                                            
 enclosed_porch    x3ssn_porch      screen_porch      pool_area      
 Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000  
 1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000  
 Median :  0.00   Median :  0.00   Median :  0.00   Median :  0.000  
 Mean   : 21.95   Mean   :  3.41   Mean   : 15.06   Mean   :  2.759  
 3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000  
 Max.   :552.00   Max.   :508.00   Max.   :480.00   Max.   :738.000  
                                                                     
    misc_val           mo_sold          yr_sold       sale_type   
 Min.   :    0.00   Min.   : 1.000   Min.   :2006   WD     :1267  
 1st Qu.:    0.00   1st Qu.: 5.000   1st Qu.:2007   New    : 122  
 Median :    0.00   Median : 6.000   Median :2008   COD    :  43  
 Mean   :   43.49   Mean   : 6.322   Mean   :2008   ConLD  :   9  
 3rd Qu.:    0.00   3rd Qu.: 8.000   3rd Qu.:2009   ConLI  :   5  
 Max.   :15500.00   Max.   :12.000   Max.   :2010   ConLw  :   5  
                                                    (Other):   9  
 sale_condition   sale_price    
 Abnorml: 101   Min.   : 34900  
 AdjLand:   4   1st Qu.:129975  
 Alloca :  12   Median :163000  
 Family :  20   Mean   :180921  
 Normal :1198   3rd Qu.:214000  
 Partial: 125   Max.   :755000  
                                
Show the code
# Compact column-by-column preview of the cleaned data.
house_data |> glimpse()
Rows: 1,460
Columns: 69
$ id              <fct> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
$ ms_sub_class    <fct> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 2…
$ ms_zoning       <fct> RL, RL, RL, RL, RL, RL, RL, RL, RM, RL, RL, RL, RL, RL…
$ lot_frontage    <dbl> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91…
$ lot_area        <dbl> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 6…
$ lot_shape       <fct> Reg, Reg, IR1, IR1, IR1, IR1, Reg, IR1, Reg, Reg, Reg,…
$ land_contour    <fct> Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl, Lvl,…
$ lot_config      <fct> Inside, FR2, Inside, Corner, FR2, Inside, Inside, Corn…
$ neighborhood    <fct> CollgCr, Veenker, CollgCr, Crawfor, NoRidge, Mitchel, …
$ condition1      <fct> Norm, Feedr, Norm, Norm, Norm, Norm, Norm, PosN, Arter…
$ condition2      <fct> Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, Norm, …
$ bldg_type       <fct> 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, 1Fam, …
$ house_style     <fct> 2Story, 1Story, 2Story, 2Story, 2Story, 1.5Fin, 1Story…
$ overall_qual    <dbl> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, …
$ overall_cond    <dbl> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, …
$ year_built      <dbl> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, …
$ year_remod_add  <dbl> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, …
$ roof_style      <fct> Gable, Gable, Gable, Gable, Gable, Gable, Gable, Gable…
$ exterior1st     <fct> VinylSd, MetalSd, VinylSd, Wd Sdng, VinylSd, VinylSd, …
$ exterior2nd     <fct> VinylSd, MetalSd, VinylSd, Wd Shng, VinylSd, VinylSd, …
$ mas_vnr_type    <fct> BrkFace, None, BrkFace, None, BrkFace, None, Stone, St…
$ mas_vnr_area    <dbl> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306…
$ exter_qual      <fct> Gd, TA, Gd, TA, Gd, TA, Gd, TA, TA, TA, TA, Ex, TA, Gd…
$ exter_cond      <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA…
$ foundation      <fct> PConc, CBlock, PConc, BrkTil, PConc, Wood, PConc, CBlo…
$ bsmt_qual       <fct> Gd, Gd, Gd, TA, Gd, Gd, Ex, Gd, TA, TA, TA, Ex, TA, Gd…
$ bsmt_cond       <fct> TA, TA, TA, Gd, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA…
$ bsmt_exposure   <fct> No, Gd, Mn, No, Av, No, Av, Mn, No, No, No, No, No, Av…
$ bsmt_fin_type1  <fct> GLQ, ALQ, GLQ, ALQ, GLQ, GLQ, GLQ, ALQ, Unf, GLQ, Rec,…
$ bsmt_fin_sf1    <dbl> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, …
$ bsmt_fin_type2  <fct> Unf, Unf, Unf, Unf, Unf, Unf, Unf, BLQ, Unf, Unf, Unf,…
$ bsmt_fin_sf2    <dbl> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ bsmt_unf_sf     <dbl> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, …
$ total_bsmt_sf   <dbl> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, …
$ heating_qc      <fct> Ex, Ex, Ex, Gd, Ex, Ex, Ex, Ex, Gd, Ex, Ex, Ex, TA, Ex…
$ central_air     <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, …
$ x1st_flr_sf     <dbl> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077…
$ x2nd_flr_sf     <dbl> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, …
$ low_qual_fin_sf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ gr_liv_area     <dbl> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, …
$ bsmt_full_bath  <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, …
$ bsmt_half_bath  <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ full_bath       <dbl> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, …
$ half_bath       <dbl> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, …
$ bedroom_abv_gr  <dbl> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, …
$ kitchen_abv_gr  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, …
$ kitchen_qual    <fct> Gd, TA, Gd, Gd, Gd, TA, Gd, TA, TA, TA, TA, Ex, TA, Gd…
$ tot_rms_abv_grd <dbl> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6,…
$ fireplaces      <dbl> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, …
$ garage_type     <fct> Attchd, Attchd, Attchd, Detchd, Attchd, Attchd, Attchd…
$ garage_yr_blt   <dbl> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, …
$ garage_finish   <fct> RFn, RFn, RFn, Unf, RFn, Unf, RFn, RFn, Unf, RFn, Unf,…
$ garage_cars     <dbl> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, …
$ garage_area     <dbl> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384,…
$ garage_qual     <fct> TA, TA, TA, TA, TA, TA, TA, TA, Fa, Gd, TA, TA, TA, TA…
$ garage_cond     <fct> TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA, TA…
$ paved_drive     <fct> Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, Y, …
$ wood_deck_sf    <dbl> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 1…
$ open_porch_sf   <dbl> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 21…
$ enclosed_porch  <dbl> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0…
$ x3ssn_porch     <dbl> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ screen_porch    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0…
$ pool_area       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ misc_val        <dbl> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 70…
$ mo_sold         <dbl> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, …
$ yr_sold         <dbl> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, …
$ sale_type       <fct> WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, WD, New, WD, N…
$ sale_condition  <fct> Normal, Normal, Normal, Abnorml, Normal, Normal, Norma…
$ sale_price      <dbl> 208500, 181500, 223500, 140000, 250000, 143000, 307000…
Show the code
# Summary statistics of the cleaned data, grouped by column type.
house_data |> skim()
Data summary
Name house_data
Number of rows 1460
Number of columns 69
_______________________
Column type frequency:
factor 33
numeric 36
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
id 0 1 FALSE 1460 1: 1, 10: 1, 100: 1, 100: 1
ms_sub_class 0 1 FALSE 15 20: 536, 60: 299, 50: 144, 120: 87
ms_zoning 0 1 FALSE 5 RL: 1151, RM: 218, FV: 65, RH: 16
lot_shape 0 1 FALSE 4 Reg: 925, IR1: 484, IR2: 41, IR3: 10
land_contour 0 1 FALSE 4 Lvl: 1311, Bnk: 63, HLS: 50, Low: 36
lot_config 0 1 FALSE 5 Ins: 1052, Cor: 263, Cul: 94, FR2: 47
neighborhood 0 1 FALSE 25 NAm: 225, Col: 150, Old: 113, Edw: 100
condition1 0 1 FALSE 9 Nor: 1260, Fee: 81, Art: 48, RRA: 26
condition2 0 1 FALSE 8 Nor: 1445, Fee: 6, Art: 2, Pos: 2
bldg_type 0 1 FALSE 5 1Fa: 1220, Twn: 114, Dup: 52, Twn: 43
house_style 0 1 FALSE 8 1St: 726, 2St: 445, 1.5: 154, SLv: 65
roof_style 0 1 FALSE 6 Gab: 1141, Hip: 286, Fla: 13, Gam: 11
exterior1st 0 1 FALSE 15 Vin: 515, HdB: 222, Met: 220, Wd : 206
exterior2nd 0 1 FALSE 16 Vin: 504, Met: 214, HdB: 207, Wd : 197
mas_vnr_type 0 1 FALSE 5 Non: 864, Brk: 445, Sto: 128, Brk: 15
exter_qual 0 1 FALSE 4 TA: 906, Gd: 488, Ex: 52, Fa: 14
exter_cond 0 1 FALSE 5 TA: 1282, Gd: 146, Fa: 28, Ex: 3
foundation 0 1 FALSE 6 PCo: 647, CBl: 634, Brk: 146, Sla: 24
bsmt_qual 0 1 FALSE 5 TA: 649, Gd: 618, Ex: 121, Mis: 37
bsmt_cond 0 1 FALSE 5 TA: 1311, Gd: 65, Fa: 45, Mis: 37
bsmt_exposure 0 1 FALSE 5 No: 953, Av: 221, Gd: 134, Mn: 114
bsmt_fin_type1 0 1 FALSE 7 Unf: 430, GLQ: 418, ALQ: 220, BLQ: 148
bsmt_fin_type2 0 1 FALSE 7 Unf: 1256, Rec: 54, LwQ: 46, Mis: 38
heating_qc 0 1 FALSE 5 Ex: 741, TA: 428, Gd: 241, Fa: 49
central_air 0 1 FALSE 2 Y: 1365, N: 95
kitchen_qual 0 1 FALSE 4 TA: 735, Gd: 586, Ex: 100, Fa: 39
garage_type 0 1 FALSE 7 Att: 870, Det: 387, Bui: 88, Mis: 81
garage_finish 0 1 FALSE 4 Unf: 605, RFn: 422, Fin: 352, Mis: 81
garage_qual 0 1 FALSE 6 TA: 1311, Mis: 81, Fa: 48, Gd: 14
garage_cond 0 1 FALSE 6 TA: 1326, Mis: 81, Fa: 35, Gd: 9
paved_drive 0 1 FALSE 3 Y: 1340, N: 90, P: 30
sale_type 0 1 FALSE 9 WD: 1267, New: 122, COD: 43, Con: 9
sale_condition 0 1 FALSE 6 Nor: 1198, Par: 125, Abn: 101, Fam: 20

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
lot_frontage 259 0.82 70.05 24.28 21 59.00 69.0 80.00 313 ▇▃▁▁▁
lot_area 0 1.00 10516.83 9981.26 1300 7553.50 9478.5 11601.50 215245 ▇▁▁▁▁
overall_qual 0 1.00 6.10 1.38 1 5.00 6.0 7.00 10 ▁▂▇▅▁
overall_cond 0 1.00 5.58 1.11 1 5.00 5.0 6.00 9 ▁▁▇▅▁
year_built 0 1.00 1971.27 30.20 1872 1954.00 1973.0 2000.00 2010 ▁▂▃▆▇
year_remod_add 0 1.00 1984.87 20.65 1950 1967.00 1994.0 2004.00 2010 ▅▂▂▃▇
mas_vnr_area 8 0.99 103.69 181.07 0 0.00 0.0 166.00 1600 ▇▁▁▁▁
bsmt_fin_sf1 0 1.00 443.64 456.10 0 0.00 383.5 712.25 5644 ▇▁▁▁▁
bsmt_fin_sf2 0 1.00 46.55 161.32 0 0.00 0.0 0.00 1474 ▇▁▁▁▁
bsmt_unf_sf 0 1.00 567.24 441.87 0 223.00 477.5 808.00 2336 ▇▅▂▁▁
total_bsmt_sf 0 1.00 1057.43 438.71 0 795.75 991.5 1298.25 6110 ▇▃▁▁▁
x1st_flr_sf 0 1.00 1162.63 386.59 334 882.00 1087.0 1391.25 4692 ▇▅▁▁▁
x2nd_flr_sf 0 1.00 346.99 436.53 0 0.00 0.0 728.00 2065 ▇▃▂▁▁
low_qual_fin_sf 0 1.00 5.84 48.62 0 0.00 0.0 0.00 572 ▇▁▁▁▁
gr_liv_area 0 1.00 1515.46 525.48 334 1129.50 1464.0 1776.75 5642 ▇▇▁▁▁
bsmt_full_bath 0 1.00 0.43 0.52 0 0.00 0.0 1.00 3 ▇▆▁▁▁
bsmt_half_bath 0 1.00 0.06 0.24 0 0.00 0.0 0.00 2 ▇▁▁▁▁
full_bath 0 1.00 1.57 0.55 0 1.00 2.0 2.00 3 ▁▇▁▇▁
half_bath 0 1.00 0.38 0.50 0 0.00 0.0 1.00 2 ▇▁▅▁▁
bedroom_abv_gr 0 1.00 2.87 0.82 0 2.00 3.0 3.00 8 ▁▇▂▁▁
kitchen_abv_gr 0 1.00 1.05 0.22 0 1.00 1.0 1.00 3 ▁▇▁▁▁
tot_rms_abv_grd 0 1.00 6.52 1.63 2 5.00 6.0 7.00 14 ▂▇▇▁▁
fireplaces 0 1.00 0.61 0.64 0 0.00 1.0 1.00 3 ▇▇▁▁▁
garage_yr_blt 81 0.94 1978.51 24.69 1900 1961.00 1980.0 2002.00 2010 ▁▁▅▅▇
garage_cars 0 1.00 1.77 0.75 0 1.00 2.0 2.00 4 ▁▃▇▂▁
garage_area 0 1.00 472.98 213.80 0 334.50 480.0 576.00 1418 ▂▇▃▁▁
wood_deck_sf 0 1.00 94.24 125.34 0 0.00 0.0 168.00 857 ▇▂▁▁▁
open_porch_sf 0 1.00 46.66 66.26 0 0.00 25.0 68.00 547 ▇▁▁▁▁
enclosed_porch 0 1.00 21.95 61.12 0 0.00 0.0 0.00 552 ▇▁▁▁▁
x3ssn_porch 0 1.00 3.41 29.32 0 0.00 0.0 0.00 508 ▇▁▁▁▁
screen_porch 0 1.00 15.06 55.76 0 0.00 0.0 0.00 480 ▇▁▁▁▁
pool_area 0 1.00 2.76 40.18 0 0.00 0.0 0.00 738 ▇▁▁▁▁
misc_val 0 1.00 43.49 496.12 0 0.00 0.0 0.00 15500 ▇▁▁▁▁
mo_sold 0 1.00 6.32 2.70 1 5.00 6.0 8.00 12 ▃▆▇▃▃
yr_sold 0 1.00 2007.82 1.33 2006 2007.00 2008.0 2009.00 2010 ▇▇▇▇▅
sale_price 0 1.00 180921.20 79442.50 34900 129975.00 163000.0 214000.00 755000 ▇▅▁▁▁

3 Data Exploration

Show the code
# Histograms of the outcome on its raw scale and on the natural-log scale,
# arranged side by side.
p1 <- house_data |>
  ggplot(aes(sale_price)) +
  geom_histogram(bins = 30, fill = "blue", alpha = 0.6) +
  ggtitle("Distribution of Sale Prices")

p2 <- house_data |>
  ggplot(aes(log(sale_price))) +
  geom_histogram(bins = 30, fill = "red", alpha = 0.6) +
  ggtitle("Distribution of Log-Transformed Sale Prices")

gridExtra::grid.arrange(grobs = list(p1, p2), ncol = 2)

Show the code
# Pairwise correlations between the numeric columns, computed only on rows
# without missing values and drawn as the upper triangle of a circle plot.
numeric_vars <- cor(
  select(house_data, where(is.numeric)),
  use = "complete.obs"
)
corrplot(numeric_vars, type = "upper", method = "circle", tl.cex = 0.6)

Show the code
# Count the NA entries per column, keep only the columns that have any, and
# show the counts as horizontal bars.
missing_summary <- house_data |>
  summarise(across(everything(), \(col) sum(is.na(col)))) |>
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "missing_count"
  ) |>
  filter(missing_count > 0) |>
  arrange(desc(missing_count))

missing_summary |>
  ggplot(aes(x = reorder(variable, -missing_count), y = missing_count)) +
  geom_col(fill = "red", alpha = 0.6) +
  coord_flip() +
  labs(
    title = "Variables with Missing Values",
    x = "Variable",
    y = "Missing Count"
  )

4 Data splitting

Show the code
# Seeded 80/20 split stratified on sale_price, followed by 5-fold
# cross-validation repeated 3 times on the training portion.
set.seed(123)
house_split <- house_data |>
  initial_split(prop = 0.8, strata = sale_price)
house_train <- house_split |> training()
house_test <- house_split |> testing()

house_folds <- house_train |>
  vfold_cv(v = 5, repeats = 3, strata = sale_price)

5 Recipe Definitions

Show the code
# Basic preprocessing recipe (dummy-coded, normalised predictors) intended for
# linear models, SVM, KNN, etc.
basic_rec <- recipe(sale_price ~ ., data = house_train) |>
  # Give the identifier a non-predictor role, then drop it altogether.
  update_role(id, new_role = "ID") |>
  step_rm(id) |>
  step_mutate(ms_sub_class = as.factor(ms_sub_class)) |>
  # Handle previously unseen and missing levels of nominal predictors.
  step_novel(all_nominal_predictors()) |>
  step_unknown(all_nominal_predictors()) |>
  # Fill in missing numeric predictors with KNN imputation.
  step_impute_knn(all_numeric_predictors()) |>
  # The outcome is modelled as log(sale_price + 1).
  step_log(sale_price, offset = 1) |>
  step_nzv(all_predictors()) |>
  step_dummy(all_nominal_predictors()) |>
  # Remove zero-variance columns that remain after dummy coding.
  step_zv(all_predictors()) |>
  step_normalize(all_numeric_predictors())

6 Model definitions

Show the code
# 1. Regularised linear regression on the glmnet engine; the penalty and the
#    mixture are both tuned.
lm_spec <- linear_reg(mixture = tune(), penalty = tune()) |>
  set_engine("glmnet")

# 2. Random forest regression on the ranger engine; mtry, trees and min_n are
#    tuned.
rf_spec <- rand_forest(trees = tune(), mtry = tune(), min_n = tune()) |>
  set_engine("ranger") |>
  set_mode("regression")

# 3. Boosted-tree regression on the xgboost engine; all six listed
#    hyperparameters are tuned.
xgb_spec <- boost_tree(
  trees = tune(),
  tree_depth = tune(),
  min_n = tune(),
  learn_rate = tune(),
  loss_reduction = tune(),
  sample_size = tune()
) |>
  set_engine("xgboost") |>
  set_mode("regression")

# 4. Radial-basis-function SVM regression on the kernlab engine; cost and
#    rbf_sigma are tuned.
svm_spec <- svm_rbf(rbf_sigma = tune(), cost = tune()) |>
  set_engine("kernlab") |>
  set_mode("regression")

# 5. K-nearest-neighbours regression on the kknn engine; the neighbour count,
#    the weighting function and the distance power are tuned.
knn_spec <- nearest_neighbor(
  dist_power = tune(),
  weight_func = tune(),
  neighbors = tune()
) |>
  set_engine("kknn") |>
  set_mode("regression")

7 Workflow set

Show the code
# Cross the single preprocessing recipe with each of the five model
# specifications to obtain one workflow per model.
house_wfs <- workflow_set(
  preproc = list(basic = basic_rec),
  models = list(
    lm = lm_spec, rf = rf_spec, xgb = xgb_spec,
    svm = svm_spec, knn = knn_spec
  ),
  cross = TRUE
)

house_wfs
# A workflow set/tibble: 5 × 4
  wflow_id  info             option    result    
  <chr>     <list>           <list>    <list>    
1 basic_lm  <tibble [1 × 4]> <opts[0]> <list [0]>
2 basic_rf  <tibble [1 × 4]> <opts[0]> <list [0]>
3 basic_xgb <tibble [1 × 4]> <opts[0]> <list [0]>
4 basic_svm <tibble [1 × 4]> <opts[0]> <list [0]>
5 basic_knn <tibble [1 × 4]> <opts[0]> <list [0]>

8 Tuning Setup

Show the code
# Identifiers of the workflows in the set.
house_wfs[["wflow_id"]]
[1] "basic_lm"  "basic_rf"  "basic_xgb" "basic_svm" "basic_knn"
Show the code
# Set up parallel processing: one PSOCK worker per physical core, keeping one
# core free for the main session.
# detectCores() returns NA when the core count is unknown, and a single-core
# machine would leave 0 workers after the subtraction; either value makes
# makePSOCKcluster() fail, so the worker count is clamped to at least 1.
cores <- max(1L, parallel::detectCores(logical = FALSE) - 1L, na.rm = TRUE)
cl <- makePSOCKcluster(cores)
registerDoParallel(cl)
# NOTE(review): no active code visible in this file stops the cluster; call
# stopCluster(cl) once the parallel work is finished.

# Control for racing: keep the out-of-sample predictions, parallelise over
# both resamples and parameter candidates, and do not store the workflow.
race_ctrl <- control_race(
  save_pred = TRUE,
  parallel_over = "everything",
  save_workflow = FALSE
)

9 Model tuning

Show the code
## Tune models
# The tuning code below is intentionally commented out: it writes its result
# to "house_tune_results.Rdata", which the next section loads instead of
# re-running the tuning. Uncomment the block to regenerate that file.
# set.seed(456)
# time <- Sys.time()

#tune_results <- house_wfs %>%
#  workflow_map(
#    "tune_race_anova",
#    seed = 1503,
#    resamples = house_folds,
#    grid = 15,  # Reduced for demonstration
#    control = race_ctrl,
#    verbose = TRUE,
#    metrics = metric_set(rmse, mae, rsq)
#  )

#stopCluster(cl)
#Sys.time() - time

#save(tune_results, file = "house_tune_results.Rdata")

10 Result Analysis

Show the code
# Restore the previously saved tuning results (object `tune_results`).
load("house_tune_results.Rdata")

# Check which models worked: RMSE of the best configuration of each workflow.
tune_results |>
  rank_results(select_best = TRUE) |>
  filter(.metric == "rmse") |>
  select(wflow_id, mean, std_err, model, preprocessor) |>
  knitr::kable()
wflow_id mean std_err model preprocessor
basic_xgb 0.1291433 0.0051342 boost_tree recipe
basic_rf 0.1395734 0.0050172 rand_forest recipe
basic_svm 0.1395907 0.0075550 svm_rbf recipe
basic_lm 0.1468725 0.0105441 linear_reg recipe
basic_knn 0.1781342 0.0046869 nearest_neighbor recipe
Show the code
# Plot the RMSE of each workflow's best configuration, with slanted x labels.
tune_results |>
  autoplot(metric = "rmse", select_best = TRUE) +
  theme_minimal() +
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

Show the code
# Collect best models
# One row per workflow holding its best RMSE result. group_by() returns the
# groups sorted by wflow_id, which also fixes the element order of
# `best_models` below.
best_results <- tune_results %>%
  rank_results(select_best = TRUE) %>%
  filter(.metric == "rmse") %>%
  group_by(wflow_id) %>%
  slice(1) %>%
  ungroup()

metrics <- metric_set(rmse, rsq, mae)

# For every workflow: select its best hyperparameters by RMSE, finalise the
# workflow with them, then fit on the training part of `house_split` and
# evaluate on its test part via last_fit().
# The loop walks over the ids themselves rather than 1:nrow(), so an empty
# `best_results` simply performs no iterations.
best_models <- list()
for (workflow_id in best_results$wflow_id) {
  best_params <- tune_results %>%
    extract_workflow_set_result(id = workflow_id) %>%
    select_best(metric = "rmse")

  best_models[[workflow_id]] <- tune_results %>%
    extract_workflow(id = workflow_id) %>%
    finalize_workflow(best_params) %>%
    last_fit(split = house_split, metrics = metrics)
}
# Persist the fitted results so later chunks can reload them.
save(best_models, file = "best_models.Rdata")

11 Final Evaluation

Show the code
# Gather the metrics of every final fit, keep the RMSE rows, and order them
# from the lowest to the highest estimate.
final_metrics <- best_models |>
  map(collect_metrics) |>
  bind_rows(.id = "wflow_id") |>
  filter(.metric == "rmse") |>
  arrange(.estimate)

knitr::kable(final_metrics)
wflow_id .metric .estimator .estimate .config
basic_xgb rmse standard 0.1325482 pre0_mod0_post0
basic_lm rmse standard 0.1355971 pre0_mod0_post0
basic_svm rmse standard 0.1362208 pre0_mod0_post0
basic_rf rmse standard 0.1424286 pre0_mod0_post0
basic_knn rmse standard 0.1810107 pre0_mod0_post0
Show the code
# Variable importance for top models
# The three workflows with the lowest RMSE in `final_metrics`.
top_models <- final_metrics %>%
  slice(1:3) %>%
  pull(wflow_id)

for (model in top_models) {
  if (model %in% names(best_models)) {
    # Local names avoid masking the tidymodels functions workflow() and fit().
    model_workflow <- extract_workflow(best_models[[model]])
    parsnip_fit <- extract_fit_parsnip(model_workflow)

    # Importance plots are drawn only for ranger and xgboost engine fits.
    # inherits() returns a single TRUE/FALSE, which is what if() expects,
    # instead of combining two class() lookups with the elementwise `|`.
    if (inherits(parsnip_fit$fit, c("ranger", "xgb.Booster"))) {
      p <- vip(parsnip_fit) +
        ggtitle(paste("Variable Importance -", model))
      print(p)
    }
  }
}

Show the code
# Stack the predictions of all final fits, attach each model's RMSE, and
# order the models by that RMSE for faceting.
all_predictions <- best_models |>
  map(collect_predictions) |>
  bind_rows(.id = "wflow_id") |>
  left_join(
    select(final_metrics, wflow_id, rmse = .estimate),
    by = "wflow_id"
  ) |>
  mutate(wflow_id = fct_reorder(wflow_id, rmse))

# Predicted against actual values, one panel per model, with the identity
# line as a reference.
all_predictions |>
  ggplot(aes(x = sale_price, y = .pred, color = wflow_id)) +
  geom_point(alpha = 0.4) +
  geom_abline(slope = 1, intercept = 0, color = "black") +
  facet_wrap(~wflow_id, scales = "free") +
  labs(
    title = "Actual vs Predicted Sale Prices by Model",
    x = "Actual Price (log)",
    y = "Predicted Price (log)"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

Show the code
# Horizontal bar chart of the RMSE per model, with bars ordered by RMSE.
performance_plot <- final_metrics |>
  mutate(wflow_id = fct_reorder(wflow_id, .estimate)) |>
  ggplot(aes(x = wflow_id, y = .estimate, fill = wflow_id)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Model Performance Comparison (RMSE)",
    x = "Model",
    y = "RMSE"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

performance_plot

Show the code
# Reload the saved final fits, then look up the best RMSE parameter set of
# every workflow in the tuning results.
load("best_models.Rdata")
best_results <- tune_results |>
  split(~wflow_id) |>
  map(\(wf) select_best(
    extract_workflow_set_result(wf, id = wf$wflow_id),
    metric = "rmse"
  ))

# Metrics of the final fits, one row per model and metric.
test_metrics <- best_models |>
  map(collect_metrics) |>
  bind_rows(.id = "mod") |>
  select(mod, .metric, test = .estimate)

# Resampling metrics of each workflow's best configuration.
validation_metrics <- tune_results |>
  rank_results(select_best = TRUE) |>
  select(mod = wflow_id, .metric, validation = mean)

# Put both sources side by side: one row per model, one column per
# source-and-metric combination.
por_test_valid <- full_join(
  test_metrics,
  validation_metrics,
  by = c("mod", ".metric")
) |>
  filter(!is.na(test) | !is.na(validation)) |>
  pivot_wider(names_from = .metric, values_from = c(test, validation))

por_test_valid |>
  gt() |>
  gt::tab_header(title = "Porównanie metryk: Test vs Walidacja")
Porównanie metryk: Test vs Walidacja
mod test_rmse test_rsq test_mae validation_rmse validation_rsq validation_mae
basic_knn 0.1810107 0.8041151 0.12080024 0.1781342 0.8065330 0.12219563
basic_lm 0.1355971 0.8860900 0.08891214 0.1468725 0.8637595 0.09352487
basic_rf 0.1424286 0.8759129 0.09221052 0.1395734 0.8838688 0.09329818
basic_svm 0.1362208 0.8851133 0.08602618 0.1395907 0.8800521 0.08900324
basic_xgb 0.1325482 0.8935040 0.09166778 0.1291433 0.8948408 0.08776987
Show the code
# Dot plot of every collected metric per model, one panel per metric.
best_models |>
  map(collect_metrics) |>
  bind_rows(.id = "mod") |>
  ggplot(aes(x = mod, y = .estimate, color = mod)) +
  geom_point(size = 3) +
  facet_wrap(~.metric, scales = "free_y") +
  theme_minimal() +
  labs(title = "Porównanie modeli", x = "Model", y = "Wartość metryki")

12 Plots

Show the code
# Long-format table of RMSE and R-squared per model, with one row for each of
# the two sources (test / validation).
best_plot_data <- full_join(
  test_metrics,
  validation_metrics,
  by = c("mod", ".metric")
) |>
  filter(.metric %in% c("rmse", "rsq")) |>
  pivot_longer(
    cols = c(test, validation),
    names_to = "dataset",
    values_to = "value"
  )

# Dodged bars comparing the two sources per model, one panel per metric.
p_combined <- best_plot_data |>
  ggplot(aes(x = mod, y = value, fill = dataset)) +
  geom_col(width = 0.7, position = position_dodge(width = 0.8)) +
  facet_wrap(
    ~.metric,
    scales = "free_y",
    labeller = as_labeller(c(rmse = "RMSE", rsq = "R²"))
  ) +
  labs(
    title = "Porównanie metryk RMSE i R²: Test vs Walidacja",
    x = "Model",
    y = "Wartość metryki",
    fill = "Zbiór danych"
  ) +
  theme_bw() +
  theme(
    legend.position = "top",
    axis.text.x = element_text(angle = 45, hjust = 1)
  ) +
  scale_fill_manual(values = c("validation" = "#E41A1C", "test" = "#377EB8"))

print(p_combined)

Show the code
# Bars of (test - validation) per model and metric; the fill colour encodes
# the sign of the difference and a dashed line marks zero.
diff_plot <- best_plot_data |>
  pivot_wider(names_from = dataset, values_from = value) |>
  mutate(difference = test - validation) |>
  ggplot(aes(x = mod, y = difference, fill = difference > 0)) +
  geom_col() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  facet_wrap(~.metric, scales = "free_y") +
  scale_fill_manual(values = c("TRUE" = "red", "FALSE" = "blue")) +
  labs(
    title = "Różnica między testem a walidacją (Test - Walidacja)",
    x = "Model",
    y = "Różnica",
    fill = "Test > Walidacja"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(diff_plot)